#Although I was able to obtain the data from AWS, the Aslett instance would not allow me to install packages therefore I moved the code to my desktop instance

#I was unable to knit the file while pointing to AWS therefore I saved CSV files that I then called off my local directory

Loading Data From S3 URL Using RCurl

#install.packages("RCurl")
#library("RCurl") 
#library(jsonlite)

#cs2 <- read.table(textConnection(getURL(
#  "https://smuddsproject2.s3-us-east-2.amazonaws.com/CaseStudy2-data.csv"
#)), sep=",", header=TRUE)

Loading Data From S3 URL Using RCurl

#install.packages("RCurl")
#library("RCurl") 
#library(jsonlite)

#cs2 <- read.table(textConnection(getURL(
#  "https://smuddsproject2.s3-us-east-2.amazonaws.com/CaseStudy2-data.csv"
#)), sep=",", header=TRUE)

Loading Data From S3 Objects Using the aws.s3 package

#library(tidyverse)
#library(aws.s3)
#library(readxl)

#Sys.setenv("AWS_ACCESS_KEY_ID" = "AKIARVCEVL4C3YMB7QUW",
#           "AWS_SECRET_ACCESS_KEY" = "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",
#           "AWS_DEFAULT_REGION" = "us-east-2")

# Using aws.s3
#aws.s3::bucketlist()
#aws.s3::get_bucket("smuddsproject2")

#Read in Data
#cs2cmpst = s3read_using(FUN = read.csv,
#                        bucket = "smuddsproject2",
#                        object = "CaseStudy2CompSet No Attrition.csv")

#cs2cmpns = s3read_using(FUN = read_excel,
#                         bucket = "smuddsproject2",
#                         object = "CaseStudy2CompSet No Salary.xlsx")

#Save the data obtained from AWS to my desktop/cloud to process #By saving these I was then unable to call the files and then knit the RMD file for completion #My workaround in the eleventh hour!

#Write to CSV files
#write.csv(cs2, "cs2.csv", row.names = F)
#write.csv(cs2cmpst, "cs2cmpst.csv", row.names = F)
#write.csv(cs2cmpns, "cs2cmpns.csv", row.names = F)

#Reading the data off my local directory

cs2 = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")
cs2cmpns = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")
cs2cmpst = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")

This section is to look at the individual variables to determine if they impact Attrition Those variables that showed no impact the plots were removed from this section

#Look for missing observations
anyNA(cs2)
## [1] FALSE
#CONTINIOUS VARIABLES
# Age
cs2 %>% 
  ggplot(aes(x=Age, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Age by Attrition") + 
  xlab("Age (Years)") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
 group_by(Attrition) %>%
  summarize(medianAge = median(Age)) %>%
  ggplot(aes(x=Attrition, y=medianAge, fill = Attrition)) +
  geom_col() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Age by Attrition") + 
  xlab("Attrition")+ ylab("Median Age (Years)") + 
  theme(legend.position = "none")

# DistanceFromHome
cs2 %>% 
  ggplot(aes(x=DistanceFromHome, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Distance from Home by Attrition") + 
  xlab("Distance From Home") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  group_by(Attrition) %>%
  summarize(medianDistanceFromHome = median(DistanceFromHome),count=n()) %>%
  ggplot(aes(x=Attrition, y=medianDistanceFromHome, fill = Attrition)) +
  geom_col() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Distance From Home by Attrition") + 
  xlab("Attrition")+ ylab("Median Distance from Home") + 
  theme(legend.position = "none")

# NumCompaniesWorked
cs2 %>% 
  ggplot(aes(x=NumCompaniesWorked, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Number of Companies Worked by Attrition") + 
  xlab("Number of Companies Worked") + ylab("") + xlim(0,8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 33 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

cs2 %>% 
  group_by(Attrition) %>%
  summarize(medianNumCompaniesWorked = median(NumCompaniesWorked),count=n()) %>%
  ggplot(aes(x=Attrition, y=medianNumCompaniesWorked, fill=Attrition)) +
  geom_col() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Number of Companies Worked by Attrition") + 
  xlab("Attrition")+ ylab("Median Number of Companies Worked") + 
  theme(legend.position = "none")

# YearsAtCompany
cs2 %>% 
  ggplot(aes(x=YearsAtCompany, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Years at Company by Attrition") + 
  xlab("Years at Company") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  group_by(Attrition) %>%
  summarize(medianYearsAtCompany = median(YearsAtCompany),count=n()) %>%
  ggplot(aes(x=Attrition, y=medianYearsAtCompany, fill=Attrition)) +
  geom_col() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Years at Company by Attrition") + 
  xlab("Attrition")+ ylab("Median Years at Company") + 
  theme(legend.position = "none")

# YearsInCurrentRole
cs2 %>% 
  ggplot(aes(x=YearsInCurrentRole, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Years in Current Role by Attrition") + 
  xlab("Years in Current Role") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  group_by(Attrition) %>%
  summarize(medianYearsInCurrentRole = median(YearsInCurrentRole),count=n()) %>%
  ggplot(aes(x=Attrition, y=medianYearsInCurrentRole, fill=Attrition)) +
  geom_col() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Years in Current Role by Attrition") + 
  xlab("Attrition")+ ylab("Median Years in Current Role") + 
  theme(legend.position = "none")

# YearsWithCurrManager
cs2 %>% 
  ggplot(aes(x=YearsWithCurrManager, color=Attrition, fill=Attrition)) +
  geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Years with Current Manager by Attrition") + 
  xlab("Years with Current Manager") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  group_by(Attrition) %>%
  summarize(medianYearsWithCurrManager = median(YearsWithCurrManager),count=n()) %>%
  ggplot(aes(x=Attrition, y=medianYearsWithCurrManager, fill=Attrition)) +
  geom_col()  + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Median Years with Current Manager by Attrition") + 
  xlab("Attrition")+ ylab("Median Years with Current Manager") + 
  theme(legend.position = "none")

#FACTORS
# Education
cs2$EducationF = as.factor(cs2$Education)
#summary(cs2$EducationF)
cs2 %>% 
  ggplot(aes(x=EducationF, color=Attrition, fill=Attrition)) +
  geom_bar() + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Education by Attrition") + 
  xlab("Education") + ylab("") 

# EnvironmentSatisifaction
cs2$EnvironmentSatisfactionF = as.factor(cs2$EnvironmentSatisfaction)
#summary(cs2$EnvironmentSatisfactionF)
cs2 %>% 
  ggplot(aes(x=EnvironmentSatisfactionF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Environment Satisfaction by Attrition") + 
  xlab("Environment Satisfaction") + ylab("") 

# JobInvolvement
cs2$JobInvolvementF = as.factor(cs2$JobInvolvement)
#summary(cs2$JobInvolvementF)
cs2 %>% 
  ggplot(aes(x=JobInvolvementF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Job Involvement by Attrition") + 
  xlab("Job Involvement") + ylab("") 

# JobLevel
cs2$JobLevelF = as.factor(cs2$JobLevel)
cs2 %>% 
  ggplot(aes(x=JobLevelF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Job Level by Attrition") + 
  xlab("Job Level") + ylab("") 

# JobSatification  
cs2$JobSatisfactionF = as.factor(cs2$JobSatisfaction)
cs2 %>% 
  ggplot(aes(x=JobSatisfactionF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Job Satisfaction by Attrition") + 
  xlab("Job Satisfaction") + ylab("") 

# PerformanceRating  
cs2$PerformanceRatingF = as.factor(cs2$PerformanceRating)
cs2 %>% 
  ggplot(aes(x=PerformanceRatingF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Performance Rating by Attrition") + 
  xlab("Performance Rating") + ylab("") 

# RelationshipSatification
cs2$RelationshipSatisfactionF = as.factor(cs2$RelationshipSatisfaction)
#summary(cs2$RelationshipSatisfactionF)
cs2 %>% 
  ggplot(aes(x=RelationshipSatisfactionF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Relationship Satisfaction by Attrition") + 
  xlab("Relationship Satisfaction") + ylab("") 

# StockOptionLevel
cs2$StockOptionLevelF = as.factor(cs2$StockOptionLevel)
cs2 %>% 
  ggplot(aes(x=StockOptionLevelF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Stock Option Level by Attrition") + 
  xlab("Stock Option Level") + ylab("") 

# WorkLifeBalance
cs2$WorkLifeBalanceF = as.factor(cs2$WorkLifeBalance)
cs2 %>% 
  ggplot(aes(x=WorkLifeBalanceF, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Work Life Balance by Attrition") + 
  xlab("Work Life Balance") + ylab("") 

#CATEGORICAL

# BusinessTravel
cs2 %>% 
  proc_freq("BusinessTravel", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=BusinessTravel, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Business Travel by Attrition") + 
  xlab("Business Travel") + ylab("")

# Department
cs2 %>% 
  proc_freq("Department", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=Department, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution Department by Attrition") + 
  xlab("Department") + ylab("")

# EducationField
cs2 %>% 
  proc_freq("EducationField", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=EducationField, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution Education Field by Attrition") + 
  xlab("Education Field") + ylab("")

# Gender
cs2 %>% 
  proc_freq("Gender", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=Gender, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Gender by Attrition") + 
  xlab("Gender") + ylab("")

# MaritalStatus
cs2 %>% 
  proc_freq("MaritalStatus", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=MaritalStatus, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Marital Status by Attrition") + 
  xlab("Marital Status") + ylab("")

# Overtime
cs2 %>% 
  proc_freq("OverTime", "Attrition",
              include.row_percent = FALSE,
            include.column_percent = FALSE)
cs2 %>% 
  ggplot(aes(x=OverTime, color=Attrition, fill=Attrition)) +
  geom_bar() +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Distribution of Overtime by Attrition") + 
  xlab("Marital Status") + ylab("")

After looking at the relationship between Attrition and individual variables it was determined that TotalWorkingYears, MonthlyIncome and JobRole were the top three factors that contribute to Attrition

cs2 %>% 
  select(TotalWorkingYears, MonthlyIncome, JobRole) %>%
  ggpairs(mapping = aes(color = cs2$Attrition)) + 
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Impact of TotalWorkingYears on Attrition

cs2 %>% 
  group_by(Attrition) %>%
  summarize(meanTWH = mean(TotalWorkingYears),
            sdTWH = sd(TotalWorkingYears),
            medianTWH = median(TotalWorkingYears),
            rangeTWH = max(TotalWorkingYears)-min(TotalWorkingYears),
            IQRTWH = IQR(TotalWorkingYears),
            countTWH = n())
## # A tibble: 2 × 7
##   Attrition meanTWH sdTWH medianTWH rangeTWH IQRTWH countTWH
##   <chr>       <dbl> <dbl>     <dbl>    <int>  <dbl>    <int>
## 1 No          11.6   7.46      10         37      9      730
## 2 Yes          8.19  7.16       6.5       40      7      140
cs2 %>% 
  ggplot(aes(x=TotalWorkingYears, fill=Attrition)) + 
  geom_histogram() + scale_fill_viridis(discrete = TRUE) +
  ggtitle("Impact of Total Working Years on Attrition") + 
  xlab("Total Working Years") + 
  ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  ggplot(aes(y=TotalWorkingYears, color=Attrition)) + 
  geom_boxplot() +
  scale_color_viridis(discrete = TRUE) +
  ggtitle("Comparison of Total Working Years and Attrition") + 
  ylab("Total Working Years") 

#Create factor cuts
cs2$TWYcut = cut(cs2$TotalWorkingYears, breaks = c(0,5,10,15,20,25,30,35,100), labels = c("< 5","5-10","10-15","15-20","20-25","25-30","30-35",">35"), right=FALSE)

freqTWY <- cs2 %>% 
  proc_freq("TWYcut", "Attrition",
            include.row_percent = FALSE,
            include.column_percent = FALSE,
            include.table_percent = FALSE,
            include.column_total = FALSE,
            include.row_total = FALSE,
            )
 
freqTWY$body$dataset$PctAtt <- (freqTWY$body$dataset$Yes/(freqTWY$body$dataset$Yes + freqTWY$body$dataset$No))*100
freqTWY$body$dataset
##   TWYcut     label  No Yes    PctAtt
## 1    < 5 Frequency  91  46 33.576642
## 2   5-10 Frequency 241  46 16.027875
## 3  10-15 Frequency 196  26 11.711712
## 4  15-20 Frequency  85  11 11.458333
## 5  20-25 Frequency  65   6  8.450704
## 6  25-30 Frequency  25   2  7.407407
## 7  30-35 Frequency  20   2  9.090909
## 8    >35 Frequency   7   1 12.500000
cs2 %>% 
  ggplot(aes(x=TWYcut, color=Attrition, fill=Attrition)) +
  geom_bar() +  
  scale_color_viridis(discrete = TRUE) +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Impact of Attrition per Total Working Years") +
  xlab("Total Working Years") + ylab("")

#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for Attrition
# H_A: At least one population distribution differs for attrition
kruskal.test(TotalWorkingYears~Attrition, data=cs2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  TotalWorkingYears by Attrition
## Kruskal-Wallis chi-squared = 34.605, df = 1, p-value = 4.038e-09

Reject Ho, there is overwhelming evidence that suggests Total Working Years has an impact on the outcome of Attrition. 33.6% of attritions occur less then five total working years with nearly 50% of attritions occuring within 10 years of total working.

#Impact of MonthlyIncome on Attrition

cs2 %>% 
  group_by(Attrition) %>%
  summarize(meanMI = mean(MonthlyIncome),
            medianMI = median(MonthlyIncome),
            rangeMI = max(MonthlyIncome)-min(MonthlyIncome),
            IQRMI = IQR(MonthlyIncome),
            IQRMI = IQR(MonthlyIncome),
            countMI = n())
## # A tibble: 2 × 6
##   Attrition meanMI medianMI rangeMI IQRMI countMI
##   <chr>      <dbl>    <dbl>   <int> <dbl>   <int>
## 1 No         6702     5208.   18870 5574.     730
## 2 Yes        4765.    3171    18778 3497.     140
cs2 %>% 
  ggplot(aes(x=MonthlyIncome, fill=Attrition)) + geom_histogram() + 
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  ggtitle("Impact of Monthly Income on Attrition") + 
  xlab("Monthly Income") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cs2 %>% 
  ggplot(aes(x=MonthlyIncome, color=Attrition)) + geom_boxplot() + scale_color_viridis(discrete = TRUE) +
  ggtitle("Impact of Monthly Income and Attrition") + xlab("Monthly Income") + ylab("")

#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for Attrition
# H_A: At least one population distribution differs for attrition
kruskal.test(MonthlyIncome~Attrition, data=cs2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MonthlyIncome by Attrition
## Kruskal-Wallis chi-squared = 34.59, df = 1, p-value = 4.069e-09

Reject Ho, there is overwhelming evidence that suggests MonthlyIncome has an impact on the outcome of Attrition. It appears that more attrition occurs with a lower monthly income. The median monthly income is 24% lower for employee attrtion, compared to non-attrition.

#Impact of Job Involvement on Attrition

cs2 %>% 
  ggplot(aes(x=JobInvolvementF, color=Attrition, fill=Attrition)) +
  geom_bar() + scale_fill_viridis(discrete = TRUE) + 
  ggtitle("Distribution of Job Involvement and Attrition") + 
  xlab("Job Involvement") + ylab("")

freqJI <- cs2 %>% 
  proc_freq("JobInvolvementF", "Attrition",
            include.row_percent = FALSE,
            include.column_percent = FALSE,
            include.table_percent = FALSE,
            include.column_total = FALSE,
            include.row_total = FALSE,
            )

freqJI$body$dataset$PctAtt <- (freqJI$body$dataset$Yes/(freqJI$body$dataset$Yes + freqJI$body$dataset$No))*100
freqJI$body$dataset
##   JobInvolvementF     label  No Yes    PctAtt
## 1               1 Frequency  25  22 46.808511
## 2               2 Frequency 184  44 19.298246
## 3               3 Frequency 447  67 13.035019
## 4               4 Frequency  74   7  8.641975
freqJIc <- cbind(freqJI$body$dataset$Yes, freqJI$body$dataset$No)

#Ho: Attrition is independent of Job Involvement
#Ha: Attriton is not independent of Job Involvement
chisq.test(freqJIc)
## 
##  Pearson's Chi-squared test
## 
## data:  freqJIc
## X-squared = 41.465, df = 3, p-value = 5.211e-09

Reject Ho, there is overwhelming evidence that suggests Job Involvement has an impact on the outcome of Attrition. The data does not provide text explaining each job involvement level however that it may be assumed that desirable job involvement increase with each level. Almost 50% of the attrition occurs at the lowest level of job involvement (1), which includes the least number of employees.

#Impact of Job Role on Attrition

cs2 %>% 
  ggplot(aes(x=JobRole, color=Attrition, fill=Attrition)) +
  geom_bar() + scale_fill_viridis(discrete = TRUE) + 
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  ggtitle("Distribution of Job Role and Attrition") + 
  xlab("Job Role") + ylab("")

freqJR <- cs2 %>% 
  proc_freq("JobRole", "Attrition",
            include.row_percent = FALSE,
            include.column_percent = FALSE,
            include.table_percent = FALSE,
            include.column_total = FALSE,
            include.row_total = FALSE,
            )

freqJR$body$dataset$PctAtt <- (freqJR$body$dataset$Yes/(freqJR$body$dataset$Yes + freqJR$body$dataset$No))*100
freqJR$body$dataset
##                     JobRole     label  No Yes    PctAtt
## 1 Healthcare Representative Frequency  68   8 10.526316
## 2           Human Resources Frequency  21   6 22.222222
## 3     Laboratory Technician Frequency 123  30 19.607843
## 4                   Manager Frequency  47   4  7.843137
## 5    Manufacturing Director Frequency  85   2  2.298851
## 6         Research Director Frequency  50   1  1.960784
## 7        Research Scientist Frequency 140  32 18.604651
## 8           Sales Executive Frequency 167  33 16.500000
## 9      Sales Representative Frequency  29  24 45.283019
freqJRc <- cbind(freqJR$body$dataset$Yes, freqJR$body$dataset$No)

#Ho: Attrition is independent of Job Role
#Ha: Attriton is not independent of Job Role
chisq.test(freqJRc)
## Warning in chisq.test(freqJRc): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  freqJRc
## X-squared = 60.543, df = 8, p-value = 3.647e-10

Reject Ho, there is overwhelming evidence that suggests Job Role has an impact on the outcome of Attrition. Almost half (45.3%) of the attrition occurs for the Sales Representatives.

#Fit a model with the variables to identify those that may assist with the prediciting of the model
#Kitchen Sink
cs2$AttritionT <-ifelse(cs2$Attrition=="Yes",1,0)
fitR = lm(AttritionT ~ MonthlyIncome+TotalWorkingYears+JobRole+JobSatisfactionF+WorkLifeBalanceF+PercentSalaryHike+YearsSinceLastPromotion+EducationF+EducationField+EmployeeNumber+EnvironmentSatisfaction+Gender+PerformanceRatingF+Department+DistanceFromHome+Age+BusinessTravel+JobInvolvementF+JobLevelF+MaritalStatus+NumCompaniesWorked+OverTime+RelationshipSatisfactionF+StockOptionLevelF+TrainingTimesLastYear+YearsAtCompany+YearsInCurrentRole+YearsSinceLastPromotion+YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike + DailyRate, data=cs2)
summary(fitR)
## 
## Call:
## lm(formula = AttritionT ~ MonthlyIncome + TotalWorkingYears + 
##     JobRole + JobSatisfactionF + WorkLifeBalanceF + PercentSalaryHike + 
##     YearsSinceLastPromotion + EducationF + EducationField + EmployeeNumber + 
##     EnvironmentSatisfaction + Gender + PerformanceRatingF + Department + 
##     DistanceFromHome + Age + BusinessTravel + JobInvolvementF + 
##     JobLevelF + MaritalStatus + NumCompaniesWorked + OverTime + 
##     RelationshipSatisfactionF + StockOptionLevelF + TrainingTimesLastYear + 
##     YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike + 
##     DailyRate, data = cs2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57471 -0.19881 -0.06755  0.11703  1.10963 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       8.849e-01  2.073e-01   4.269 2.20e-05 ***
## MonthlyIncome                    -2.473e-06  1.081e-05  -0.229 0.819111    
## TotalWorkingYears                -4.655e-03  3.387e-03  -1.374 0.169759    
## JobRoleHuman Resources            7.900e-02  1.546e-01   0.511 0.609582    
## JobRoleLaboratory Technician     -1.975e-02  5.728e-02  -0.345 0.730286    
## JobRoleManager                   -3.975e-02  9.699e-02  -0.410 0.682025    
## JobRoleManufacturing Director    -5.312e-02  5.052e-02  -1.051 0.293370    
## JobRoleResearch Director         -1.219e-01  7.868e-02  -1.550 0.121617    
## JobRoleResearch Scientist        -8.051e-02  5.753e-02  -1.399 0.162074    
## JobRoleSales Executive            3.477e-02  1.065e-01   0.326 0.744264    
## JobRoleSales Representative       1.491e-01  1.202e-01   1.241 0.214990    
## JobSatisfactionF2                -4.279e-02  3.440e-02  -1.244 0.213898    
## JobSatisfactionF3                -4.797e-02  3.111e-02  -1.542 0.123462    
## JobSatisfactionF4                -1.242e-01  3.075e-02  -4.039 5.88e-05 ***
## WorkLifeBalanceF2                -1.699e-01  5.135e-02  -3.308 0.000980 ***
## WorkLifeBalanceF3                -1.906e-01  4.826e-02  -3.949 8.52e-05 ***
## WorkLifeBalanceF4                -1.889e-01  5.632e-02  -3.353 0.000836 ***
## PercentSalaryHike                 2.151e-03  4.692e-03   0.458 0.646810    
## YearsSinceLastPromotion           1.202e-02  4.563e-03   2.633 0.008618 ** 
## EducationF2                       5.509e-02  4.029e-02   1.367 0.171890    
## EducationF3                       3.866e-02  3.703e-02   1.044 0.296734    
## EducationF4                       2.723e-02  3.999e-02   0.681 0.496069    
## EducationF5                       7.276e-04  7.225e-02   0.010 0.991967    
## EducationFieldLife Sciences      -2.043e-01  1.088e-01  -1.877 0.060829 .  
## EducationFieldMarketing          -1.952e-01  1.157e-01  -1.687 0.091956 .  
## EducationFieldMedical            -2.194e-01  1.089e-01  -2.014 0.044380 *  
## EducationFieldOther              -2.006e-01  1.167e-01  -1.718 0.086123 .  
## EducationFieldTechnical Degree   -1.248e-01  1.134e-01  -1.100 0.271582    
## EmployeeNumber                   -1.713e-05  1.804e-05  -0.949 0.342813    
## EnvironmentSatisfaction          -2.700e-02  9.904e-03  -2.726 0.006545 ** 
## GenderMale                        1.411e-02  2.203e-02   0.640 0.522186    
## PerformanceRatingF4               1.966e-02  4.792e-02   0.410 0.681704    
## DepartmentResearch & Development  1.737e-01  1.421e-01   1.222 0.221892    
## DepartmentSales                   1.763e-01  1.466e-01   1.203 0.229464    
## DistanceFromHome                  4.372e-03  1.348e-03   3.244 0.001229 ** 
## Age                              -3.373e-03  1.678e-03  -2.010 0.044737 *  
## BusinessTravelTravel_Frequently   9.354e-02  4.174e-02   2.241 0.025288 *  
## BusinessTravelTravel_Rarely       3.207e-02  3.556e-02   0.902 0.367377    
## JobInvolvementF2                 -2.170e-01  5.087e-02  -4.265 2.23e-05 ***
## JobInvolvementF3                 -2.698e-01  4.879e-02  -5.530 4.32e-08 ***
## JobInvolvementF4                 -3.114e-01  5.863e-02  -5.312 1.41e-07 ***
## JobLevelF2                       -1.422e-01  4.887e-02  -2.910 0.003716 ** 
## JobLevelF3                       -6.000e-02  8.092e-02  -0.741 0.458669    
## JobLevelF4                       -6.398e-02  1.280e-01  -0.500 0.617223    
## JobLevelF5                        8.250e-02  1.596e-01   0.517 0.605274    
## MaritalStatusMarried              4.050e-02  2.970e-02   1.364 0.173062    
## MaritalStatusSingle               3.735e-02  4.670e-02   0.800 0.424115    
## NumCompaniesWorked                1.878e-02  4.956e-03   3.790 0.000162 ***
## OverTimeYes                       2.131e-01  2.385e-02   8.935  < 2e-16 ***
## RelationshipSatisfactionF2       -6.246e-02  3.456e-02  -1.807 0.071115 .  
## RelationshipSatisfactionF3       -8.228e-02  3.132e-02  -2.627 0.008771 ** 
## RelationshipSatisfactionF4       -8.683e-02  3.101e-02  -2.801 0.005224 ** 
## StockOptionLevelF1               -1.363e-01  3.622e-02  -3.762 0.000180 ***
## StockOptionLevelF2               -1.286e-01  4.880e-02  -2.635 0.008571 ** 
## StockOptionLevelF3               -1.341e-02  5.562e-02  -0.241 0.809505    
## TrainingTimesLastYear            -1.758e-02  8.571e-03  -2.052 0.040528 *  
## YearsAtCompany                    4.828e-03  4.037e-03   1.196 0.232136    
## YearsInCurrentRole               -6.639e-03  5.061e-03  -1.312 0.189991    
## YearsWithCurrManager             -5.525e-03  4.998e-03  -1.105 0.269302    
## HourlyRate                        5.815e-04  5.391e-04   1.079 0.280996    
## MonthlyRate                      -3.404e-07  1.537e-06  -0.221 0.824782    
## DailyRate                        -2.133e-05  2.708e-05  -0.788 0.431163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3097 on 808 degrees of freedom
## Multiple R-squared:  0.3402, Adjusted R-squared:  0.2903 
## F-statistic: 6.828 on 61 and 808 DF,  p-value: < 2.2e-16

Naive Bayes Find the average accuracy of 100 train/test splits Model determined from LM of all dataset variables Selection from those that showed signifiance from the model

# Use for the train/test & preditcions
NB = cs2 %>% select(Attrition,TotalWorkingYears,MonthlyIncome,YearsSinceLastPromotion, PerformanceRatingF, Age, OverTime, RelationshipSatisfactionF, JobInvolvementF , BusinessTravel)

#testNB2 will be used for the prediction of the attritions
testNB2 = cs2cmpst %>% select(ID,TotalWorkingYears,MonthlyIncome,YearsSinceLastPromotion, PerformanceRating, Age, OverTime, RelationshipSatisfaction, JobInvolvement, BusinessTravel)

testNB2$PerformanceRatingF = as.factor(testNB2$PerformanceRating)
testNB2$RelationshipSatisfactionF = as.factor(testNB2$RelationshipSatisfaction)
testNB2$JobInvolvementF = as.factor(testNB2$JobInvolvement)
 
AccHolder = numeric(100)
SensHolder = numeric(100)
SpecHolder = numeric(100)
 
for (seed in 1:100)
{
set.seed(seed)
trainIndicies = sample(1:dim(NB)[1],round(.75 * dim(NB)[1]))
trainNB = NB[trainIndicies,]
testNB = NB[-trainIndicies,]
 
modelNB = naiveBayes(trainNB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")], trainNB$Attrition)

PredNB = table(predict(modelNB,testNB), testNB$Attrition)

CMNB = confusionMatrix(table(testNB$Attrition,predict(modelNB, testNB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")])))     
 
AccHolder[seed] = CMNB$overall[1]
SensHolder[seed] = CMNB$byClass[1]
SpecHolder[seed] = CMNB$byClass[2]
}

#Plot of the 100 mean Accuracies
MACC <- data.frame(AccHolder)
MACC %>% ggplot(aes(x=AccHolder))+
  geom_histogram(fill="blue",color="black",bins=40) +
  ggtitle("Mean Accuracy =",round(mean(AccHolder), digits = 4)) + 
  xlab("Model Accuracies") +
  ylab("Frequency")

#Plot of the 100 mean Sensativities
MSEN <- data.frame(SensHolder)
MSEN %>% ggplot(aes(x=SensHolder))+
  geom_histogram(fill="red",color="black", bins=40) +
  ggtitle("Mean Sensitivity =",round(mean(SensHolder), digits = 4)) + 
  xlab("Model Sensitivities") +
  ylab("Frequency")

#Plot of the 100 mean Specificities
MSPC <- data.frame(SpecHolder)
MSPC %>% ggplot(aes(x=SpecHolder))+
  geom_histogram(fill="green",color="black",bins=40) +
  ggtitle("Mean Specificity =",round(mean(SpecHolder), digits = 4)) + 
  xlab("Model Specificities") +
  ylab("Frequency")

#Model to use for the creatio  of the predicted attritions 
modelNB2 = naiveBayes(NB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")], NB$Attrition)

#Create the Attrition Prediction CSV file
Case2PredictionsAdyanthayaAttritions <- data.frame(testNB2 %>% select(ID),predict(modelNB,testNB2))
Case2PredictionsAdyanthayaAttritions <- setNames(Case2PredictionsAdyanthayaAttritions, c("ID","Attrition"))
#write to csv file
write.csv(Case2PredictionsAdyanthayaAttritions, "Case2PredictionsAdyanthayaAttritions.csv", row.names = F)

#Accuracy
mean(AccHolder)
## [1] 0.8541743
sd(AccHolder)/sqrt(100)
## [1] 0.002240888
#Sensitivity
mean(SensHolder)
## [1] 0.8751597
sd(SensHolder)/sqrt(100)
## [1] 0.002156709
#Specificity
mean(SpecHolder)
## [1] 0.6169045
sd(SpecHolder)/sqrt(100)
## [1] 0.01309779

The mean accuracy of the model was calculated to be 85.4%. This reflects the number of correctly classified observations. The mean sensitivity of the model was calculated to be 87,5%. This reflects the number of correctly identified attritions that are truly employee attritions. The mean specificity of the model was calculated to be 61.7%. This reflects the number of correctly identified non-attritions that are truly employee non-attritions.

knn Find the average accuracy of 100 train/test splits Model determined from LM of all dataset variables Selection from those that showed signifiance from the model

cs2knn <- cs2 %>% select(Attrition, Age, BusinessTravel, MonthlyIncome, OverTime, TWYcut, JobInvolvement, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, YearsSinceLastPromotion, NumCompaniesWorked, StockOptionLevel, TrainingTimesLastYear, DistanceFromHome, EducationField)
cs2knn$Attrition <-  ifelse(cs2knn$Attrition=="Yes",1,0)
cs2knn$OverTime <-  ifelse(cs2knn$OverTime=="Yes",1,0)
cs2knn$BusinessTravel <- ifelse(cs2knn$BusinessTravel=="Non-Travel",1,
                                ifelse(cs2knn$BusinessTravel=="Travel_Rarely",2,3))
#cs2knn$MaritalStatus <- ifelse(cs2knn$MaritalStatus=="Single",1,
#                               ifelse(cs2knn$MaritalStatus=="Married",2,3))
cs2knn$EducationField <- ifelse(cs2knn$EducationField=="Life Sciences",1,
                                ifelse(cs2knn$EducationField=="Marketing",2,
                                       ifelse(cs2knn$EducationField=="Medical",3,
                                              ifelse(cs2knn$EducationField=="Technical Degree",4,5))))

cs2knn$Age = as.numeric(cs2knn$Age)
cs2knn$MonthlyIncome = as.numeric(cs2knn$MonthlyIncome)
#cs2knn$TotalWorkingYears = as.numeric(cs2knn$TotalWorkingYears)
cs2knn$JobInvolvement = as.numeric(cs2knn$JobInvolvement)
cs2knn$PerformanceRating = as.numeric(cs2knn$PerformanceRating)
cs2knn$RelationshipSatisfaction = as.numeric(cs2knn$RelationshipSatisfaction)
cs2knn$WorkLifeBalance = as.numeric(cs2knn$WorkLifeBalance)
cs2knn$YearsSinceLastPromotion = as.numeric(cs2knn$YearsSinceLastPromotion)
cs2knn$NumCompaniesWorked = as.numeric(cs2knn$NumCompaniesWorked)
cs2knn$StockOptionLevel = as.numeric(cs2knn$StockOptionLevel)
#cs2knn$YearsWithCurrManager = as.numeric(cs2knn$YearsWithCurrManager)
cs2knn$TrainingTimesLastYear = as.numeric(cs2knn$TrainingTimesLastYear)
cs2knn$DistanceFromHome = as.numeric(cs2knn$DistanceFromHome)
cs2knn$TWYcut = as.numeric(cs2knn$TWYcut)



set.seed(1)
iterations = 100
numks = 5
splitPerc = .75

masterAcc = matrix(nrow = iterations, ncol = numks)
masterSens = matrix(nrow = iterations, ncol = numks)
masterSpec = matrix(nrow = iterations, ncol = numks)

for(j in 1:iterations)
{
  trainIndicies = sample(1:dim(cs2knn)[1],round(splitPerc * dim(cs2knn)[1]))
  trainKnn = cs2knn[trainIndicies,]
  testKnn = cs2knn[-trainIndicies,]
  for(i in 1:numks)
  {
    modelKnn = knn(trainKnn[,c("Attrition", "Age", "BusinessTravel", "MonthlyIncome",
                               "OverTime", "TWYcut", "JobInvolvement",
                               "PerformanceRating", "RelationshipSatisfaction","WorkLifeBalance", "YearsSinceLastPromotion","NumCompaniesWorked","StockOptionLevel","TrainingTimesLastYear","DistanceFromHome","EducationField")],
                   testKnn[,c("Attrition", "Age", "BusinessTravel", "MonthlyIncome", "OverTime",
                              "TWYcut", "JobInvolvement", "PerformanceRating",
                              "RelationshipSatisfaction","WorkLifeBalance","YearsSinceLastPromotion","NumCompaniesWorked","StockOptionLevel","TrainingTimesLastYear","DistanceFromHome","EducationField")],
                   trainKnn$Attrition, prob = TRUE, k=i)
    table(modelKnn,testKnn$Attrition)
    CMKnn = confusionMatrix(table(modelKnn,testKnn$Attrition))
    masterAcc[j,i] = CMKnn$overall[1]
    masterSens[j,i] = CMKnn$byClass[1]
    masterSpec[j,i] = CMKnn$byClass[2]
  }
}

#Accuracy
MeanAcc = colMeans(masterAcc)
which.max(MeanAcc)
## [1] 5
max(MeanAcc)
## [1] 0.8237615
#Sensitivity
MeanSens = colMeans(masterSens)
which.max(MeanSens)
## [1] 5
max(MeanSens)
## [1] 0.9695651
#Specificity
MeanSpec = colMeans(masterSpec)
which.max(MeanSpec)
## [1] 1
max(MeanSpec)
## [1] 0.2074003

Unfortunately I was not able to find an ideal model that would improve the specificity.

#JOB ROLE

#Overview of Age, Education, Monthly Income and Business Travel on Job Role
cs2 %>% 
  select(Age,EducationF, MonthlyIncome, BusinessTravel) %>%
  ggpairs(mapping = aes(color = cs2$JobRole))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Impact of Age on Job Role

# Age
cs2 %>%
  ggplot(aes(y=Age, color=JobRole)) +
  geom_boxplot() +
  scale_color_viridis(discrete = TRUE) + 
  ggtitle("Age & Job Role") + 
  xlab("") + ylab("Age (Years)") 

#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for JobRole
# H_A: At least one population distribution differs for JobRole
cs2AJR <- kruskal.test(Age~JobRole, data=cs2)
cs2AJR
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by JobRole
## Kruskal-Wallis chi-squared = 158.99, df = 8, p-value < 2.2e-16

Reject Ho, there is overwhelming evidence that suggests Age has an impact on Job Role. It appears that leadership roles (e.g. directors & executives) tends to have employees that are older in age whereas entry level positions have younger employees.

#Impact of Total Working Years on Job Role

cs2 %>%
  ggplot(aes(y=TotalWorkingYears, color=JobRole)) +
  geom_boxplot() +
  scale_color_viridis(discrete = TRUE) + 
  ggtitle("Total Working Yeara & Job Role") + 
  xlab("") + ylab("Total Working Years") 

#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for JobRole
# H_A: At least one population distribution differs for JobRole
cs2TWJR <- kruskal.test(TotalWorkingYears~JobRole, data=cs2)
cs2TWJR
## 
##  Kruskal-Wallis rank sum test
## 
## data:  TotalWorkingYears by JobRole
## Kruskal-Wallis chi-squared = 333.98, df = 8, p-value < 2.2e-16

Reject Ho, there is overwhelming evidence that suggests TotalWorkingYears has an impact on Job Role. It appears that leadership roles tends to have employees that have higher total working years whereas entry level positions have lowere total working years.

#Age & Total Working Years impact on Job Role

cs2 %>%
  ggplot(aes(x=Age, y=TotalWorkingYears, color=JobRole)) +
  geom_point() 

cs2 %>%
  ggplot(aes(x=Age, y=TotalWorkingYears, color=JobRole)) +
  geom_point() +
  geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'

There is a direct relationship on between Age & Total Working Hours on Job Role. The job roles were smoothed to the data using fit linear models. As seen in the individual models, positions of leadership tend to be of high ages and total working years.

#Education impact on Job Role

cs2 %>% 
  ggplot(aes(x=EducationF, color=JobRole, fill=JobRole)) +
  geom_bar() +
  facet_grid(rows = vars(JobRole)) + 
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Education & Job Role") + 
  xlab("Education") + ylab("") +
  theme(
  strip.background = element_blank(),
  strip.text.y = element_blank()) + 
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  coord_flip()

freqE <- cs2 %>% 
  proc_freq("JobRole", "EducationF",
            include.row_percent = FALSE,
            include.column_percent = FALSE,
            include.table_percent = FALSE,
            include.column_total = FALSE,
            include.row_total = FALSE,
            )

freqE <- cbind(freqE$body$dataset$`1`, freqE$body$dataset$`2`, freqE$body$dataset$`3`, freqE$body$dataset$`4`, freqE$body$dataset$`5`)
freqE
##       [,1] [,2] [,3] [,4] [,5]
##  [1,]   10   17   24   23    2
##  [2,]    5    8    9    4    1
##  [3,]   23   40   52   35    3
##  [4,]    4   10   18   17    2
##  [5,]   10   18   38   20    1
##  [6,]    3    7   18   16    7
##  [7,]   16   30   78   44    4
##  [8,]   14   39   70   71    6
##  [9,]   13   13   17   10    0
#Ho: Attrition is independent of Job Role
#Ha: Attriton is not independent of Job Role
cs2EJR <- chisq.test(freqE)
## Warning in chisq.test(freqE): Chi-squared approximation may be incorrect
cs2EJR
## 
##  Pearson's Chi-squared test
## 
## data:  freqE
## X-squared = 63.285, df = 32, p-value = 0.000804

Reject Ho, there is strong evidence that suggests Education has an impact on Job Role. Not knowing exactly what each of the education categories represent, it may be assumed that the level of education increases with code value. Job roles such as Sales Executative or Research Scientist would be expected to have a higher education level as compared to a Healthcare Representative or Human Resources.

#MONTHLY INCOME

#Fit all of the variables to identify those that contribute to the prediction of monthly income
#fitR = lm(MonthlyIncome~Attrition+TotalWorkingYears+JobRole+JobSatisfactionF+WorkLifeBalanceF+PercentSalaryHike+YearsSinceLastPromotion+EducationF+EducationField+EmployeeNumber+EnvironmentSatisfaction+Gender+PerformanceRatingF+Department+DistanceFromHome+Age+BusinessTravel+JobInvolvementF+JobLevelF+MaritalStatus+NumCompaniesWorked+OverTime+RelationshipSatisfactionF+StockOptionLevelF+TrainingTimesLastYear+YearsAtCompany+YearsInCurrentRole+YearsSinceLastPromotion+YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike + DailyRate, data=cs2)
#summary(fitR)


set.seed(5)
trainIndicies = sample(1:dim(cs2)[1],round(.75 * dim(cs2cmpns)[1]),replace = FALSE)
trainR = cs2[trainIndicies,]
testR = cs2[-trainIndicies,]

#Determine the optimal model of variables to fit the monthly income 
#Leave one out cross validation
ModelRfit = lm(MonthlyIncome ~ TotalWorkingYears+JobRole+JobLevel+BusinessTravel+EducationF+StockOptionLevelF+Age+Gender+OverTime+NumCompaniesWorked, data=trainR, trControl = trainControl((method = "LOOCV")))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trControl' will be disregarded
summary(ModelRfit)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobRole + JobLevel + 
##     BusinessTravel + EducationF + StockOptionLevelF + Age + Gender + 
##     OverTime + NumCompaniesWorked, data = trainR, trControl = trainControl((method = "LOOCV")))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3786.5  -633.2    13.9   628.4  4134.6 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -415.087    329.841  -1.258 0.208698    
## TotalWorkingYears                 45.406     10.609   4.280 2.16e-05 ***
## JobRoleHuman Resources          -337.137    302.891  -1.113 0.266106    
## JobRoleLaboratory Technician    -650.025    192.211  -3.382 0.000765 ***
## JobRoleManager                  3995.605    261.884  15.257  < 2e-16 ***
## JobRoleManufacturing Director   -163.853    188.401  -0.870 0.384796    
## JobRoleResearch Director        4013.477    243.472  16.484  < 2e-16 ***
## JobRoleResearch Scientist       -334.351    188.288  -1.776 0.076259 .  
## JobRoleSales Executive           -35.369    160.491  -0.220 0.825647    
## JobRoleSales Representative     -422.013    241.469  -1.748 0.081007 .  
## JobLevel                        2809.779     91.618  30.668  < 2e-16 ***
## BusinessTravelTravel_Frequently  174.107    155.798   1.118 0.264201    
## BusinessTravelTravel_Rarely      373.542    132.732   2.814 0.005042 ** 
## EducationF2                     -201.361    150.197  -1.341 0.180519    
## EducationF3                     -167.674    138.465  -1.211 0.226373    
## EducationF4                     -137.291    147.288  -0.932 0.351629    
## EducationF5                     -640.731    273.806  -2.340 0.019591 *  
## StockOptionLevelF1                45.201     87.574   0.516 0.605934    
## StockOptionLevelF2              -146.268    147.816  -0.990 0.322786    
## StockOptionLevelF3               136.529    170.239   0.802 0.422866    
## Age                                2.937      6.197   0.474 0.635732    
## GenderMale                        68.361     81.732   0.836 0.403248    
## OverTimeYes                      -44.707     88.758  -0.504 0.614653    
## NumCompaniesWorked                22.731     16.632   1.367 0.172196    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1014 on 628 degrees of freedom
## Multiple R-squared:  0.9549, Adjusted R-squared:  0.9532 
## F-statistic:   578 on 23 and 628 DF,  p-value: < 2.2e-16
confint(ModelRfit)
##                                        2.5 %     97.5 %
## (Intercept)                     -1062.811787  232.63728
## TotalWorkingYears                  24.572075   66.23970
## JobRoleHuman Resources           -931.939423  257.66451
## JobRoleLaboratory Technician    -1027.478869 -272.57111
## JobRoleManager                   3481.330963 4509.87848
## JobRoleManufacturing Director    -533.825645  206.11980
## JobRoleResearch Director         3535.358905 4491.59568
## JobRoleResearch Scientist        -704.101496   35.39890
## JobRoleSales Executive           -350.533795  279.79562
## JobRoleSales Representative      -896.197997   52.17275
## JobLevel                         2629.863200 2989.69429
## BusinessTravelTravel_Frequently  -131.840956  480.05419
## BusinessTravelTravel_Rarely       112.889203  634.19415
## EducationF2                      -496.309595   93.58719
## EducationF3                      -439.584859  104.23744
## EducationF4                      -426.526625  151.94518
## EducationF5                     -1178.417704 -103.04503
## StockOptionLevelF1               -126.771930  217.17366
## StockOptionLevelF2               -436.541419  144.00546
## StockOptionLevelF3               -197.777987  470.83587
## Age                                -9.232326   15.10577
## GenderMale                        -92.140615  228.86197
## OverTimeYes                      -219.005705  129.59173
## NumCompaniesWorked                 -9.929349   55.39220
#Calculate the RSME of the model 
RSME = sqrt(mean(ModelRfit$residuals^2))
RSME
## [1] 995.3789
#Find the MSPE
ModelRPreds = predict(ModelRfit, newdata = testR)
as.data.frame(ModelRPreds)
##     ModelRPreds
## 5      2555.388
## 9      2377.410
## 13     9067.504
## 14     2616.021
## 15     2955.271
## 18     5401.720
## 19     2829.058
## 21     6023.114
## 23     4805.207
## 29     9211.908
## 33    16500.643
## 38     6007.967
## 41    16706.794
## 42     5563.804
## 46     2408.356
## 48     8612.765
## 50     2494.343
## 60     2311.995
## 61     9276.224
## 62     8878.475
## 63     2485.677
## 74     2510.807
## 75     9019.753
## 76     5182.894
## 81     6280.055
## 86    12424.973
## 88     2402.318
## 96     6665.195
## 101    2538.296
## 102    8968.668
## 105   16947.512
## 106    2556.812
## 109    6238.143
## 111    2376.805
## 112   16484.100
## 114    5987.123
## 120    6198.984
## 124    5880.195
## 136    5725.864
## 140    8952.101
## 142    2502.557
## 144    6036.339
## 151    6018.867
## 157    6253.234
## 161    5370.265
## 167   16365.541
## 168   15257.413
## 171    5716.754
## 173    9708.467
## 174    8140.047
## 175   16396.339
## 186    2711.759
## 190    5401.777
## 194    5769.220
## 195    2805.627
## 197    2575.036
## 200    2169.027
## 204    2883.221
## 205    2601.934
## 208    6644.281
## 219    2604.071
## 230   12749.602
## 242   18835.120
## 244    2627.268
## 245    6044.969
## 263    2593.205
## 266    2439.317
## 270    5765.990
## 271    5562.680
## 273    5711.876
## 276   19437.610
## 277    2489.560
## 283    6208.804
## 297    5859.724
## 298    5115.765
## 301    5826.174
## 306    2155.944
## 309    6120.286
## 315    6285.826
## 316    5865.985
## 332    9035.881
## 335   19277.347
## 336    5662.261
## 346   12161.326
## 348    5463.773
## 349    2408.141
## 350   12784.009
## 351   13492.387
## 354    5575.673
## 355    9517.657
## 359    5840.699
## 367    5286.473
## 371    5555.318
## 377    2926.677
## 378    6109.957
## 383    6492.835
## 386   13094.210
## 389    5964.525
## 398    2507.197
## 402    5552.727
## 406   16313.339
## 411    2709.940
## 416    5782.154
## 417    2209.965
## 425    5820.520
## 428    2521.502
## 430    6216.943
## 431    5727.675
## 433    5715.763
## 444    2938.633
## 451    5928.115
## 452    2758.951
## 472    2210.042
## 475    2316.992
## 485   10304.998
## 493    6096.325
## 498    2539.969
## 500    5781.991
## 504    8693.803
## 506    5794.540
## 515    2060.299
## 520    2399.425
## 526    9593.799
## 532    6328.551
## 536    2632.602
## 538    2804.916
## 541    5560.478
## 546    2842.847
## 547    9133.201
## 549   13287.609
## 550    2789.485
## 553    5692.312
## 554    3040.849
## 556    6415.308
## 560    5950.864
## 565    2346.764
## 567    5589.901
## 570   12148.857
## 571    6289.915
## 572    2563.690
## 577    3357.546
## 582   15465.615
## 583    6340.071
## 584    5828.852
## 588    2996.191
## 589    2513.366
## 595    8867.502
## 597    9022.098
## 598    5967.489
## 600    2723.442
## 603   12788.483
## 608    5839.350
## 611    5362.358
## 613    2763.155
## 615    8920.440
## 617    2622.928
## 618    5553.931
## 620    1670.008
## 621    1833.264
## 623    2895.961
## 637   16273.052
## 641    2388.009
## 650    2613.910
## 652    6195.281
## 653   13294.712
## 655    2347.022
## 657   13175.994
## 658   16308.980
## 661    5084.119
## 664    2325.830
## 669    5410.953
## 672    5719.523
## 678    5200.338
## 680    3026.800
## 682    2513.307
## 683    6058.983
## 688    2847.733
## 689    2142.830
## 690    2910.299
## 691    5606.358
## 692   12556.234
## 693    6336.301
## 695    5531.835
## 706    6042.120
## 708    2888.603
## 709    2323.404
## 712   16827.638
## 719    5990.827
## 720    8729.080
## 721    8755.275
## 722    6036.881
## 723    2686.813
## 734    5648.433
## 738    6252.169
## 741    5834.646
## 742   13078.963
## 743    5729.797
## 755   13398.838
## 765    2624.768
## 768    9018.549
## 769    6115.853
## 773    9073.715
## 781    2424.913
## 787   13092.046
## 789   19945.213
## 790    2453.169
## 795    5265.184
## 801    5625.700
## 802    6202.320
## 803   16307.615
## 813    5726.088
## 816    3157.770
## 817    5948.408
## 831    4985.476
## 836    8495.194
## 846    5399.206
## 848    9402.251
## 865    8610.787
MSPE = data.frame(Observed = testR$MonthlyIncome, Predicted = ModelRPreds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
mean(MSPE$SquaredResidual)
## [1] 1409174
#Create the Monthly Salary Predicted CSV File
cs2cmpns$EducationF = as.factor(cs2cmpns$Education)
cs2cmpns$StockOptionLevelF = as.factor(cs2cmpns$StockOptionLevel)

Case2PredictionsAdyanthayaSalary <- data.frame(cs2cmpns %>% select(ID),predict(ModelRfit, newdata=cs2cmpns))
Case2PredictionsAdyanthayaSalary <- setNames(Case2PredictionsAdyanthayaSalary, c("ID","MonthlyIncome"))
#write to csv file
write.csv(Case2PredictionsAdyanthayaSalary, "Case2PredictionsAdyanthayaSalary.csv", row.names = F)